library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lavaan)
## This is lavaan 0.6-16
## lavaan is FREE software! Please report any bugs.
library(semTools)
##
## ###############################################################################
## This is semTools 0.5-6
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
##
## Attaching package: 'semTools'
##
## The following object is masked from 'package:readr':
##
## clipboard
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.1
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
holdout <- arrow::read_feather("ignore.data-holdout-set-item-similarity-20230710-164559")
llm_holdout_meta <- arrow::read_feather("ignore.llmdata-holdout-set-item-similarity-20230710-164559.feather")
holdout_real <- holdout %>%
select(ItemStemIdA, ItemStemIdB, Pearson) %>%
left_join(llm_holdout_meta %>% select(ItemStemIdA = ItemStemId, VariableA = Variable, InstrumentA = instrument, ScaleA = scale_0, SubscaleA = scale_1)) %>%
left_join(llm_holdout_meta %>% select(ItemStemIdB = ItemStemId, VariableB = Variable, InstrumentB = instrument, ScaleB = scale_0, SubscaleB = scale_1))
## Joining with `by = join_by(ItemStemIdA)`
## Joining with `by = join_by(ItemStemIdB)`
holdout_llm <- holdout %>%
select(ItemStemIdA, ItemStemIdB, CosineSimilarity) %>%
left_join(llm_holdout_meta %>% select(ItemStemIdA = ItemStemId, VariableA = Variable, InstrumentA = instrument, ScaleA = scale_0, SubscaleA = scale_1)) %>%
left_join(llm_holdout_meta %>% select(ItemStemIdB = ItemStemId, VariableB = Variable, InstrumentB = instrument, ScaleB = scale_0, SubscaleB = scale_1))
## Joining with `by = join_by(ItemStemIdA)`
## Joining with `by = join_by(ItemStemIdB)`
cors_llm <- holdout_llm %>%
drop_na(InstrumentA, ScaleA, InstrumentB, ScaleB) %>%
select(x = VariableA, y = VariableB, r = CosineSimilarity) %>%
as.data.frame() |>
igraph::graph_from_data_frame(directed = FALSE) |>
igraph::as_adjacency_matrix(attr = "r", sparse = FALSE)
diag(cors_llm) <- 1
cors_real <- holdout_real %>%
drop_na(InstrumentA, ScaleA, InstrumentB, ScaleB) %>%
select(x = VariableA, y = VariableB, r = Pearson) %>%
as.data.frame() |>
igraph::graph_from_data_frame(directed = FALSE) |>
igraph::as_adjacency_matrix(attr = "r", sparse = FALSE)
diag(cors_real) <- 1
scales <- llm_holdout_meta %>%
drop_na(instrument, scale_0) %>%
mutate(scale = str_replace_all(paste(instrument, scale_0), "[^a-zA-Z_0-9]", "_")) %>%
group_by(scale) %>%
summarise(
items = list(Variable),
lvn = paste(first(scale), " =~ ", paste(Variable, collapse = " + "))) %>%
drop_na()
random_scales <- list()
for(i in 1:100) {
n_items <- rpois(1, 10)
random_scales[[i]] <- llm_holdout_meta %>%
drop_na(instrument, scale_0) %>%
sample_n(n_items) %>%
mutate(scale = paste0("random", i)) %>%
group_by(scale) %>%
summarise(
items = list(Variable),
lvn = paste(first(scale), " =~ ", paste(Variable, collapse = " + "))) %>%
drop_na()
}
random_scales <- bind_rows(random_scales)
scales <- bind_rows(scales, random_scales)
find_reverse_items <- function(rs) {
# Calculate the mean correlation for each item, excluding the item's correlation with itself.
mean_correlations <- apply(rs, 1, function(x) mean(x[-which(x == 1)]))
# Identify items with negative mean correlation
# You may adjust the threshold according to your specific criteria.
threshold <- -0.01
reverse_keyed_items <- names(which(mean_correlations < threshold))
# Now you know which items are likely to be reverse-coded.
reverse_keyed_items
}
find_reverse_items_by_first_item <- function(rs) {
# negatively correlated with first item
items <- rs[-1, 1]
reverse_keyed_items <- names(items)[which(items < 0)]
reverse_keyed_items
}
reverse_items <- function(rs, reverse_keyed_items) {
# Reverse the correlations for the reverse-keyed items
for (item in reverse_keyed_items) {
# Get the index of the reverse-keyed item
item_index <- which(rownames(rs) == item)
# Reverse the correlations
rs[item_index, ] <- rs[item_index, ] * -1
rs[, item_index] <- rs[, item_index] * -1
# Since the diagonal is the correlation of the item with itself, set it back to 1
rs[item_index, item_index] <- 1
}
rs
}
scales <- scales %>%
rowwise() %>%
mutate(r_real = list(cors_real[items, items]),
r_llm = list(cors_llm[items, items])) %>%
mutate(reverse_items = list(find_reverse_items_by_first_item(r_real)),
r_real_rev = list(reverse_items(r_real, reverse_items)),
r_llm_rev = list(reverse_items(r_llm, reverse_items))) %>%
mutate(
cfa_real = list(cfa(lvn, sample.cov = r_real_rev, sample.nobs = 1000)),
rel_real = semTools::compRelSEM(cfa_real)) %>%
mutate(
cfa_llm = list(cfa(lvn, sample.cov = r_llm_rev, sample.nobs = 1000)),
rel_llm = semTools::compRelSEM(cfa_llm))
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `cfa_real = list(cfa(lvn, sample.cov = r_real_rev, sample.nobs =
## 1000))`.
## ℹ In row 35.
## Caused by warning in `lavaan::lavaan()`:
## ! lavaan WARNING:
## the optimizer (NLMINB) claimed the model converged, but not all
## elements of the gradient are (near) zero; the optimizer may not
## have found a local solution use check.gradient = FALSE to skip
## this check.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
## Warning: There were 10 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `cfa_llm = list(cfa(lvn, sample.cov = r_llm_rev, sample.nobs =
## 1000))`.
## ℹ In row 37.
## Caused by warning in `lavaan::lavaan()`:
## ! lavaan WARNING:
## the optimizer warns that a solution has NOT been found!
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 9 remaining warnings.
scales <- scales %>% filter(between(as.vector(rel_llm), 0, 1)) %>% filter(between(as.vector(rel_real), 0, 1))
cor.test(scales$rel_llm, scales$rel_real)
##
## Pearson's product-moment correlation
##
## data: scales$rel_llm and scales$rel_real
## t = 13.447, df = 120, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6930576 0.8376234
## sample estimates:
## cor
## 0.7753024
# scales %>% filter(scale == "LOT_Optimism") %>% pull(cfa_real) %>% .[[1]] %>% summary
broom::tidy(cor.test(holdout$CosineSimilarity, holdout$Pearson))
## # A tibble: 1 × 8
## estimate statistic p.value parameter conf.low conf.high method alternative
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr> <chr>
## 1 0.697 293. 0 90949 0.694 0.700 Pearson's… two.sided
holdout %>%
mutate(items = str_c(ItemStemTextA, "\n", ItemStemTextB)) %>%
ggplot(., aes(CosineSimilarity, Pearson, label = items)) +
geom_abline(linetype = "dashed") +
geom_point(size = 0.1, alpha = 0.1) +
scale_color_viridis_d(guide = "none", end = 0.8) +
ylab("Actual correlation") +
theme_bw() +
xlab("LLM cosine similarity") +
ggtitle("Predicting correlations from LLMs, r = 0.7") +
guides(color = "none") +
coord_fixed(xlim = c(-1,1), ylim = c(-1,1)) -> p
p

ggplotly(p)